| No. | Name | Meaning | Data Name | Short Data Name | Analytic Type | Data Type | Unit Of Measure | Variable Type | Description / Comments |
|---|---|---|---|---|---|---|---|---|---|
| 1 | Cement | An Ingredient | cement | cement | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 2 | Blast Furnace Slag | An Ingredient | slag | slag | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 3 | Fly Ash | An Ingredient | ash | ash | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 4 | Water | An Ingredient | water | water | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 5 | Superplasticizer | An Ingredient | superplastic | splast | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 6 | Coarse Aggregate | An Ingredient | coarseagg | corse | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 7 | Fine Aggregate | An Ingredient | fineagg | fine | Quantitative | Numeric | Kg/m3 mix | Input | Predictor Variable |
| 8 | Age | Days from Casting | age | age | Quantitative | Numeric | Day (1...365) | Input | Predictor Variable |
| 9 | Concrete Compressive Strength | Measure of Strength | strength | mpa | Quantitative | Numeric | MPa (Mega Pascal) | OUTPUT | PREDICTED Variable |
###### D1.1: Univariate Analysis: 10 Marks:
Data types and description of the independent attributes which should include (name, meaning, range of values observed, central values (mean and median), standard deviation and quartiles, analysis of the body of distributions / tails, missing values, outliers.
###### D1.2: Bi-variate Analysis: 10 Marks:
Analyze Among Predictor Variables and Between Predictor & Target Columns. Comment on your findings in terms of their Relationship and Degree of Relation if any. Visualize the analysis using Boxplots and Pair Plots with Histograms or Density Curves.
###### D1.3: Feature Engineering Techniques: 10 Marks:
# v====== Standard Libraries Begin ======v #
import warnings
warnings.filterwarnings('ignore')
import numpy as np # Numerical Python libraries
# random_state = np.random.RandomState(0) # From Mukesh Rao. MSB: Do we need to do this? Working ok without it.
import pandas as pd # to handle data in form of rows and columns
import pandas_profiling
import pylab as pl # Mukesh Rao
import seaborn as sns # Data visualization for statistical graphics
import matplotlib.pyplot as plt # Data visualization for Ploting
from sklearn.svm import SVC # M.Rao; SVC = Support Vector Classification
from sklearn import metrics, svm # "svm" = Support Vector Machine > MRao; For Lin/Log Regr, DTree
from sklearn.impute import SimpleImputer
from sklearn.utils import resample, shuffle # "shuffle"=> Mukesh Rao; Bagging Sample data set creation
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut, GridSearchCV, RandomizedSearchCV # Lin/LogR, DTree
from sklearn.pipeline import Pipeline, make_pipeline # M.Rao
from sklearn.neighbors import KNeighborsClassifier # MRao
# For Linear Dimensionality (Cols/Attributes) Reduction to a Lower dimensional space (eg: reduce 15 cols to 2 cols):
from sklearn.decomposition import PCA # "Principal Component Analysis" for "Singular Value Decomposition" (SVD)
# ClusterCentroids=Cluster based UNDERsampling, TomekLinks=Under sampling by Deleting nearest majority neighbor/similar rows
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks
from imblearn.over_sampling import SMOTE # Over sampler
from imblearn.combine import SMOTETomek # OVER / UP Sampling followed by UNDER / DOWN Sampling
from mlxtend.feature_selection import SequentialFeatureSelector as sfs # For Features selection
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs # For Plotting
# ====== For Linear Regression ======
from scipy.stats import zscore, pearsonr, randint as sp_randint
from category_encoders import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, binarize, LabelEncoder, OneHotEncoder # M.Rao
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
# Import Linear Regression machine learning library:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # LinRegr
import statsmodels.api as sm # For OLS Summary in Linear Regression
import statsmodels.formula.api as smf # For OLS Summary in Linear Regression
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.classifier import ClassificationReport, ROCAUC
# ====== For Logistic Regression ======
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, classification_report, auc # Mukesh Rao
# ====== For Decision Tree ======
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
# from sklearn.externals.six import StringIO # Discontinued in Scikit Version 0.23 (available only upto Ver 0.22)
import pydotplus as pdot # to display decision tree inline within the notebook
import graphviz as gviz
# DTree does not take strings as… # … input for the model fit step....
from sklearn.feature_extraction.text import CountVectorizer
# ======= For Ensemble Techniques =======
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
# ======= Set default style ========
# Multiple output displays per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # show every bare expression's value in a cell, not just the last
from IPython.display import Image, Markdown
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>")) # Increase cell width
# ===== Options =====
pd.options.display.float_format = '{:,.2f}'.format # Remove scientific notations to display numbers with 2 decimals
pd.set_option('display.max_columns', 100) # Max df cols to display set to 100.
pd.set_option('display.max_rows', 50) # Max df rows to display set to 50.
# pd.set_option('display.max_rows', tdf.shape[0]+1) # just one row more than the total rows in df
# Update default style and size of charts
plt.figure(figsize=(12,8))  # NOTE(review): opens an empty figure; the rcParams line below is what sets the default size
plt.style.use('ggplot') # plt.style.use('classic') ??
plt.rcParams['figure.figsize'] = [10, 8]
sns.set_style(style='darkgrid')
%matplotlib inline
import pickle # For model export
from os import system # For system (eg MacOS, etc) commands from within python
# ====== Standard Libraries End ======^ #
# Read & Load the input Datafile into dataset frame: Concrete DataFrame:
cdf = pd.read_csv('concrete.csv')
cdf
# My Housekeeping: Incremental DF Data Backup 0 as of now:
cdf0 = cdf.copy() # Original Df (deep copy, so later edits to cdf do not touch it)
cdf.to_csv('cdf0.csv') # Also export as .csv file to disk
# NOTE(review): to_csv writes the row index as an extra column unless index=False is passed.
! ls -l cdf*
# Verify backup copy
cdf0.shape, type(cdf0)
cdf0.sample(7)
# My Housekeeping: Rename column names for convenience, meaningfulness and intuitiveness:
# errors='raise' makes the rename fail loudly if any of the expected columns is missing.
cdf.rename(columns={'superplastic': 'splast', 'coarseagg': 'corse', 'fineagg': 'fine', 'strength': 'mpa'},
           inplace=True, errors='raise')
cdf.sample(6)
# My Housekeeping: Incremental DF Data Backup 1 as of now:
cdf1 = cdf.copy() # Modified Df: Changed Columns Names to shorten: 'splast', 'corse', 'fine', 'mpa'
cdf1.to_csv('cdf1.csv') # Also export as .csv file to disk
! ls -l cdf*
# Verify backup copy
cdf1.shape, type(cdf1)
cdf1.sample(6)
Data types & description of the independent attributes which should include (name, meaning, range of values observed, central values (mean and median), standard deviation and quartiles, analysis of the body of distributions / tails, missing values, outliers.
# Automated EDA report (per-column stats, correlations, missing values) via pandas_profiling.
# NOTE(review): the package was later renamed ydata-profiling — update the import when upgrading.
profile = pandas_profiling.ProfileReport(cdf)
profile
### My Housekeeping: Jupyter Code File Incremental Backup 1 Tue.Jun.30 2:47am ^^^
Markdown("### Incremental Jupyter Notebook Code Backup 1")
# ! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 1.ipynb"
! ls -l Project*.ipynb
# ====== D1.1 Univariate analysis: shape, dtypes, central values, dispersion, dups, nulls, skew ======
Markdown('### * DF Head & Tail rows:')
cdf
Markdown('### * DF Random Sample rows:')
cdf.sample(7)
Markdown('### * DF Shape: Number of (Rows, Columns), DF Type:')
cdf.shape, type(cdf)
Markdown('### * DF Info with: Column Names & Data Types')
cdf.info()
Markdown('### * DF Stats for Numerical value Cols: Range (Min & Max), Central values (Mean), Std.D, Quartiles')
cdf.describe()
Markdown('### * DF Stats for Numerical value Cols: Central values: MEDIAN which are not in Standard "Describe" above^:')
cdf.median()
Markdown('### * DF Stats for All Cols: Central values: MODE which are not in Standard "Describe" above^:')
cdf.mode()
# Duplicate columns are checked by transposing and de-duplicating the rows of the transpose:
Markdown(""" ### * $ \ {dup}$ = Duplicate <u>Columns</u> Based on ALL Rows """.format(dup=cdf.T.duplicated().sum()))
Markdown(""" ### * ${dup}$ = Duplicate <u>Rows</u> Based on ALL 9 Columns """.format(dup=cdf.duplicated().sum()))
Markdown('### * DF Duplicated <u>Rows</u> Based on CERTAIN "Mixture Ingredients" <u>Columns</u>:')
print(' ', cdf.iloc[:,:8].duplicated().sum(),
      '= Dup. Rows for first 8 Cols: cement, slag, ash, water, splast, corse, fine, age. (Except: "mpa")')
print('', cdf.iloc[:,:7].duplicated().sum(),
      '= Dup. Rows for first 7 Cols: cement, slag, ash, water, splast, corse, fine. (Except: "age" & "mpa")')
print(' ', cdf.duplicated(['cement', 'slag', 'ash', 'water', 'splast', 'corse', 'fine', 'mpa']).sum(),
      '= Dup. Rows for first 7 & "mpa" Cols: cement, slag, ash, water, splast, corse, fine, mpa. (Except: "age")')
Markdown('### * DF Unique Values for All Columns:')
cdf.nunique()
Markdown('### * DF Null values for All Columns:')
cdf.isna().sum()
Markdown('### * DF NON Numeric Values in Numerical Columns:')
cdf[~cdf.select_dtypes(include='number').applymap(np.isreal).all(1)].count() # With "~" for NOT Real Numbers (Non Numeric)
Markdown('### * DF Zero Values in Numerical Columns:')
(cdf.select_dtypes(include='number') == 0).sum()
Markdown('### * DF Negative (-ve) Values in Numerical Columns:')
(cdf.select_dtypes(include='number') < 0).sum()
Markdown('### * DF Skewness for All Numerical Columns:')
cdf.skew()
Markdown('###### * Note: Skew Categories (Arbitrary): ')
print('''If distribution Skewness is between following ranges then Skewness for Column is...:
* High : < −1 OR > +1 Asymmetric Col : age
* Moderate : −1 & −0.5 OR +0.5 & +1 Asymmetric Cols: cement, slag, ash, splast
* Low : −0.5 & +0.5 Asymmetric Col : mpa
* V.Low : −0.25 & +0.25 Symmetric Col : fine
* No Skew : 0.0 OR near -0.0+ Symmetric Cols: water, corse
''')
### My Housekeeping: Jupyter Code File Incremental Backup 2 Sun.Jul.05 9:49pm ^^^
# Shell copy ("!" magic) of the notebook file as an incremental code backup:
Markdown("### Incremental Jupyter Notebook Code Backup 2")
! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 2.ipynb"
! ls -l Project*.ipynb
# Identify outlier values per numerical column using the 1.5*IQR fence rule,
# and report how many observations fall below the low fence / above the high fence.
Markdown('### * DF Number of Outliers for Numerical Columns: $ \ \ Low = Q1 - (IQR * 1.5) $; $ \ \ High = Q3 + (IQR *1.5 $)')
for column in cdf.select_dtypes(include='number'):
    first_q = cdf[column].quantile(.25)
    third_q = cdf[column].quantile(.75)
    fence = (third_q - first_q) * 1.5
    # fences are truncated to int purely for compact display
    low_cut, high_cut = int(first_q - fence), int(third_q + fence)
    n_low = (cdf[column] < low_cut).sum()
    n_high = (cdf[column] > high_cut).sum()
    print('*', column)
    print(' ', str(n_low).rjust(2, ' '), 'outliers Under Low End', str(low_cut).rjust(5, ' '))
    print(' ', str(n_high).rjust(2, ' '), 'outliers Over High End', str(high_cut).rjust(5, ' '))
    print()
# Histogram & Density of Entire Dataset: UnScaled & Scaled (zscore): Visual Distribution of DF values:
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (replaced by histplot/displot).
p = plt.figure(figsize=(20,7))
p = plt.subplot(1, 2, 1)
g = sns.distplot(cdf)
p = plt.subplot(1, 2, 2)
g = sns.distplot(cdf.corr())  # NOTE(review): this plots the correlation-matrix values, not z-scored data — confirm intent
# Individual Histogram of ALL 9 Numerical Columns: Visual Distribution of column values:
# catch plt sns outputs in some variables ("p" & "g") to supress informational output
p = plt.figure(figsize=(20, 20))
pos = 1  # subplot slot in the 3x3 grid
for col in cdf.columns:
    p = plt.subplot(3, 3, pos)
    g = sns.distplot(cdf[col])
    pos += 1
Skewness: Scale
Very few outliers: Impute / Scale.
### My Housekeeping: Jupyter Code File Incremental Backup 3 Mon.Jul.06 7:39am ^^^
# Shell copy ("!" magic) of the notebook file as an incremental code backup:
Markdown("### Incremental Jupyter Notebook Code Backup 3")
! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 3.ipynb"
! ls -l Project*.ipynb
Analyze Among Predictor Variables and Between Predictor & Target (Predicted) Columns. Comment on your findings in terms of their Relationship and Degree of Relation if any. Visualize the analysis using Boxplots and Pair Plots with Histograms or Density Curves.
# D1.2 visuals: pairwise relations among all 9 columns (KDE on the diagonal), then per-column boxplots.
Markdown('##### * PairPlot BiVariate Study among Predictor variables & with Predicted/Target column, with Density Curves (diag_kind="kde")')
g = sns.pairplot(cdf, diag_kind='kde')
Markdown('##### * Individual Box Plots of ALL 9 Numerical Columns: Visuals For Quartiles, Middle & Outlier values:')
p = plt.figure(figsize=(20, 20))
pos = 1  # subplot slot in the 3x3 grid
for col in cdf.columns:
    p = plt.subplot(3, 3, pos)
    g = sns.boxplot(cdf[col])
    pos += 1
# "Age" has 14 (limited) "bin like" non continuous numerical values, though it can have continuous values from 1 to 365
# Hence we can use BoxPlot & PointPlot to do BiVariate study between predictor "age" col and "mpa" col (predicted/target):
Markdown('### <center> * BoxPlot & PointPlot : BiVariate Study of "age" & "mpa" * </center>')
p = plt.figure(figsize=(20, 20))
# FIX: pass x/y as keyword arguments — positional x, y were deprecated in seaborn 0.11
# and removed in later releases; behavior is otherwise identical.
g = sns.boxplot(x='age', y='mpa', data=cdf)
g = sns.pointplot(x=cdf.age, y=cdf.mpa)
# Additional Exhibit 1:
Markdown('### * Correlation Matrix:')
cdf.corr()
Markdown('### * Correlation HeatMap Matrix:')
p = plt.figure(figsize=(11, 7))
g = sns.heatmap(cdf.corr())
# Additional Exhibit 2:
# Density, Histogram, Scatter plots of All Attributes Interactions:
# Different Upper & Lower triangles: eg. Scatter & KDE (Density)
Markdown('### * PairGrid BiVariate Study among Predictor variables & with Predicted/Target column')
Markdown('* <u> Upper Half</u> : LINEAR REGRESSION Fitted line thru Scatter Plot')
Markdown('* <u> Lower Half</u> : Kernel Densities among Atributes')
g = sns.PairGrid(cdf)
g = g.map_upper(sns.regplot)   # upper triangle: scatter + fitted regression line
g = g.map_lower(sns.kdeplot)   # lower triangle: bivariate density contours
g = g.map_diag(plt.hist, lw=2) # diagonal: per-column histograms
plt.show()
# Additional Exhibit 3:
Markdown('##### * BiVariate Study of Predicted/Target "mpa" (y) column with remaining 8 (X) Predictor attributes:')
Markdown('''* Regression Line Fitted thru Scatter Plot For:
1=Linear, 2=Quadratic, 3=Cubic. Using regplot(X, y) for order=1,2,3, Confidence Interval ci = 95 (line shadows)''')
p = plt.figure(figsize=(20, 20))
pos = 1  # subplot slot in the 3x3 grid
for col in cdf.columns:
    p = plt.subplot(3, 3, pos)
    # FIX: x/y as keywords — positional x, y were deprecated in seaborn 0.11 and later removed.
    g = sns.regplot(x=col, y='mpa', data=cdf, order=1)
    g = sns.regplot(x=col, y='mpa', data=cdf, order=2)
    g = sns.regplot(x=col, y='mpa', data=cdf, order=3)
    p = plt.legend(['Linear','Quadratic','Cubic'], loc="best")
    pos += 1
### My Housekeeping: Jupyter Code File Incremental Backup 4 Wed.Jul.08 12:55am ^^^
# Shell copy ("!" magic) of the notebook file as an incremental code backup:
Markdown("### Incremental Jupyter Notebook Code Backup 4")
! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 4.ipynb"
! ls -l Project*.ipynb
Relationship & Degree: Correlations: None are HIGH (-0.80+): Column(s) DROP = None
Data Dispersion: Multiple Gaussians (Humps) in Distributions: SCALE Data = Yes
Data Dispersion: Regression Estimate: SCALE Data = Yes; FE/MT = Polynomials, Adv.Regr.Models, HyperTune
Visualized: Boxplots and Pair Plots with Histograms / Density Curves as required.
Analyzed: Interaction / Relationship among Predictor Variables AND between Predictors & Predicted/Target ("mpa") column. Observations and Findings on Relationship & Degree of Relation provided above^
### My Housekeeping: Jupyter Code File Incremental Backup 5 Wed.Jul.08 11:23pm ^^^
# Shell copy ("!" magic) of the notebook file as an incremental code backup:
Markdown("### Incremental Jupyter Notebook Code Backup 5")
! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 5.ipynb"
! ls -l Project*.ipynb
# Drop Duplicate Rows (25) as determined earlier in UniVariate section:
cdf1.shape # before
cdf.drop_duplicates(keep='first', inplace=True)  # keeps the first occurrence of each duplicated row
cdf.shape # after
print('Rows Dropped:', len(cdf1)-len(cdf))  # cdf1 is the pre-drop backup, so the difference = rows removed
# My Housekeeping: Incremental DF Data Backup 2 as of Thu.Jul.09 3:29am :
cdf2 = cdf.copy() # Modified Df: Dropped 25 Duplicated Rows
cdf2.to_csv('cdf2.csv') # Also export as .csv file to disk
! ls -l cdf*
# Verify backup copy
cdf2.shape, type(cdf2)
cdf2.sample(5)
# Cap IQR-fence outliers in every numerical column: values under the low fence
# are replaced with that column's Q1, values over the high fence with its Q3.
Markdown('### * DF Outliers Imputations with Q1 (Under Low End) or Q3 (Over High End) Values for All Numerical Columns: $ \ \ Low = Q1 - (IQR * 1.5) $; $ \ \ High = Q3 + (IQR *1.5 $)')
for column in cdf.select_dtypes(include='number'):
    lower_q = cdf[column].quantile(.25)
    upper_q = cdf[column].quantile(.75)
    fence = (upper_q - lower_q) * 1.5
    low_cut = round(lower_q - fence,2)
    high_cut = round(upper_q + fence,2)
    # count the out-of-fence values BEFORE replacing them, for reporting
    n_low = (cdf[column] < low_cut).sum()
    n_high = (cdf[column] > high_cut).sum()
    cdf[column] = np.where((cdf[column] < low_cut), lower_q, cdf[column])
    cdf[column] = np.where((cdf[column] > high_cut), upper_q, cdf[column])
    print('*', column, ':', n_low+n_high, 'values imputed with Q1, Q3 value:', low_cut, high_cut)
    print(' ', str(n_low).rjust(2, ' '), 'outliers Under Low End', str(low_cut).rjust(5, ' '))
    print(' ', str(n_high).rjust(2, ' '), 'outliers Over High End', str(high_cut).rjust(5, ' '))
    print()
# Post Imputation: Identify New Outlier Values in All Numerical Columns:
# Re-runs the same IQR fence check; after the Q1/Q3 capping above this should
# report zero outliers per column (fences are recomputed on the imputed data).
Markdown('### * DF New Outliers After Imputations for Numerical Columns: NO OUTLIERS: \n $ \ \ Low = Q1 - (IQR * 1.5) $; $ \ \ High = Q3 + (IQR *1.5 $)')
for col in cdf.select_dtypes(include='number'):
    q1 = cdf[col].quantile(.25)    # first quartile
    q3 = cdf[col].quantile(.75)    # third quartile
    otr = (q3 - q1) * 1.5          # 1.5 * IQR fence width
    otl = round(q1 - otr,2)        # low fence
    oth = round(q3 + otr,2)        # high fence
    otls = (cdf[col] < otl).sum()  # count below low fence
    oths = (cdf[col] > oth).sum()  # count above high fence
    print('*', col)
    print(' ', str(otls).rjust(2, ' '), 'outliers Under Low End', str(otl).rjust(5, ' '))
    print(' ', str(oths).rjust(2, ' '), 'outliers Over High End', str(oth).rjust(5, ' '))
    print()
# Verify Outliers with BoxPlot:
Markdown('##### * Verify Outliers with BoxPlot: NO OUTLIERS:')
p = plt.figure(figsize=(20, 7))
g = cdf.boxplot()  # one combined boxplot of all numeric columns on the imputed data
# My Housekeeping: Incremental DF Data Backup 3 as of Thu.Jul.09 8:01am :
cdf3 = cdf.copy() # Modified Df: Imputed 99 Outlier values to Q1 and/or Q3 column values respectively as applicable.
cdf3.to_csv('cdf3.csv') # Also export as .csv file to disk
! ls -l cdf*
# Verify backup copy
cdf3.shape, type(cdf3)
cdf3.sample(5)
### My Housekeeping: Jupyter Code File Incremental Backup 6 Thu.Jul.09 08:04am ^^^
Markdown("### Incremental Jupyter Notebook Code Backup 6")
! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 6.ipynb"
! ls -l Project*.ipynb
cdf1  # display the pre-cleaning backup frame (still contains duplicates/outliers) for comparison
# Prepare data for split: Create X, y (Predictor, Predicted) datasets:
# X = cdf1.copy() # Contains: 25 duplicates; 99 Outliers, i.e. Before removing dups & outliers
X = cdf.copy() # Dups & Outliers Removed
y = X.pop('mpa')  # pop removes the target from X, leaving only the 8 predictors
# Split df data into 3 datasets: Train, Test, Validate:
# 20% test first, then 25% of the remaining 80% => 60% train / 20% validation / 20% test
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, random_state=3)
X_trn, X_val, y_trn, y_val = train_test_split(X_trn, y_trn, test_size=0.25, random_state=3)
X.shape, y.shape, X_trn.shape, X_tst.shape, X_val.shape, y_trn.shape, y_tst.shape, y_val.shape
type(X_trn), type(X_tst), type(X_val), type(y_trn), type(y_tst), type(y_val)
# Baseline: ordinary least-squares linear regression on the UNSCALED data.
lrm = LinearRegression()
lrm.fit(X_trn, y_trn)
print('\n* UnScaled Data: Coefficients:', lrm.coef_)
print('\n* UnScaled Data: Intercept:', lrm.intercept_)
Markdown('###### * UnScaled Data: $R^2$ Score for: In Sample & Out Of Sample (Validaton dataset):')
print(' * In Sample R^2:', lrm.score(X_trn, y_trn), 'Out Of Sample R^2:', lrm.score(X_val, y_val))
print(' * Root Mean Square Error RMSE:', mean_squared_error(y_val, lrm.predict(X_val))**0.5)
# print(lrm.summary())
# print('Predict y on Validation dataset:', lrm.predict(X_val).reshape(-1,))
# Standardize (z-score) the features and the target for the scaled linear-regression run.
# FIX: the scalers are fitted on the TRAINING split only and those statistics are re-used
# to transform the validation split — the previous code called fit_transform on
# X_val / y_val as well, which leaks the validation distribution into the evaluation
# (and refit the single scaler object four times). A separate scaler is used for y,
# since the target has different mean/variance than the features.
x_sclr = StandardScaler()
y_sclr = StandardScaler()
X_trn = pd.DataFrame(x_sclr.fit_transform(X_trn), columns=X.columns)
X_val = pd.DataFrame(x_sclr.transform(X_val), columns=X.columns)
y_trn = y_sclr.fit_transform(pd.DataFrame(y_trn))  # numpy array result, same as before
y_val = y_sclr.transform(pd.DataFrame(y_val))
# Linear regression refit on the SCALED train split; scored on the scaled validation split.
# NOTE: RMSE here is in scaled (z-score) target units, not MPa, so it is not directly
# comparable to the unscaled-data RMSE above.
lrm_s = LinearRegression()
lrm_s.fit(X_trn, y_trn)
# X_train = scaler.fit_transform(X_train)
# X_vaid = scaler.transform(X_valid)
# X_test = scaler.transform(X_test)
# model.fit(X_train,y_train)
# y_pred_valid = model.predict(X_valid).reshape(-1,) # array.reshape(-1, 1)
print('\n* SCALED Data: Coefficients:', lrm_s.coef_)
print('\n* SCALED Data: Intercept:', lrm_s.intercept_)
Markdown('###### * SCALED Data: $R^2$ Score for In Sample, Out Of Sample (Validaton dataset: SCALED Data):')
print(' * In Sample R^2:', lrm_s.score(X_trn, y_trn), 'Out Of Sample R^2:', lrm_s.score(X_val, y_val))
print(' * Root Mean Square Error RMSE:', mean_squared_error(y_val, lrm_s.predict(X_val))**0.5)
# print(lrm_s.summary())
# print('Predict y on Validation dataset:', lrm.predict(X_val).reshape(-1,))
# print('Predict y on Train dataset:', lrm.predict(X_trn).reshape(-1,))
# Degree-2 polynomial feature expansion (interaction terms only) + linear fit.
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_trn_p2 = poly.fit_transform(X_trn)
# FIX: transform (not fit_transform) on validation — reuse the feature mapping
# fitted on the training split instead of refitting it on validation data.
X_val_p2 = poly.transform(X_val)
poly2m = LinearRegression()
poly2m.fit(X_trn_p2, y_trn)
y_pred_poly2 = poly2m.predict(X_val_p2)
# print(y_pred_poly2)
print('* Data SCALED & CLEANED (No Dups, No Outliers):')
print('* R^2 Score: In Sample:', poly2m.score(X_trn_p2, y_trn), 'R^2 Score: Out Of Sample:', poly2m.score(X_val_p2, y_val))
print('* Root Mean Square Error RMSE:', mean_squared_error(y_val, poly2m.predict(X_val_p2))**0.5)
print('* Additional Features/Columns (Polynomials) Created:', X_trn_p2.shape[1] - X_trn.shape[1])
# Degree-3 polynomial feature expansion (interaction terms only) + linear fit.
poly = PolynomialFeatures(degree=3, interaction_only=True)
X_trn_p3 = poly.fit_transform(X_trn)
# FIX: transform (not fit_transform) on validation — reuse the mapping fitted on train.
X_val_p3 = poly.transform(X_val)
poly3m = LinearRegression()
poly3m.fit(X_trn_p3, y_trn)
y_pred_poly3 = poly3m.predict(X_val_p3)
# print(y_pred)
print('* Data SCALED & CLEANED (No Dups, No Outliers):')
print('* R^2 Score: In Sample:', poly3m.score(X_trn_p3, y_trn), 'R^2 Score: Out Of Sample:', poly3m.score(X_val_p3, y_val))
print('* Root Mean Square Error RMSE:', mean_squared_error(y_val, poly3m.predict(X_val_p3))**0.5)
print('* Additional Features/Columns (Polynomials) Created:', X_trn_p3.shape[1] - X_trn.shape[1])
Use the Algorithms that you think will be suitable for this project (at least 3 algorithms). Use ***Kfold Cross Validation*** to evaluate model performance. Use appropriate metrics and make a DataFrame to compare models w.r.t their metrics.
# As we just concluded above, we will revert back to the Original df WITHOUT any preprocessing applied for dups, outliers.
# Also we will split dfs again:
# Prepare data for split: Create X, y (Predictor, Predicted) datasets times 3 for Train, Validation, Test
# X = cdf.copy() # Dups & Outliers Removed
X = cdf1.copy() # Contains: 25 duplicates; 99 Outliers, i.e. Before removing Dups & Outliers
y = X.pop('mpa')  # pop removes the target from X, leaving only the 8 predictors
# Split df data into 3 datasets: Train, Test, Validate:
# same 60/20/20 scheme and random_state as the earlier split
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, random_state=3)
X_trn, X_val, y_trn, y_val = train_test_split(X_trn, y_trn, test_size=0.25, random_state=3)
X.shape, y.shape, X_trn.shape, X_tst.shape, X_val.shape, y_trn.shape, y_tst.shape, y_val.shape
type(X_trn), type(X_tst), type(X_val), type(y_trn), type(y_tst), type(y_val)
# Trying Decision Tree (regressor; .score() returns R^2 for regressors):
dTree = DecisionTreeRegressor(random_state=6)
dTree.fit(X_trn, y_trn)
print(dTree.score(X_trn, y_trn))  # in-sample R^2
print(dTree.score(X_val, y_val))  # out-of-sample R^2
# Trying Decision Tree: PRUNED (depth / leaf-size limits to reduce overfit):
dTreeR = DecisionTreeRegressor(max_depth=9, min_samples_leaf=3, random_state=6)
dTreeR.fit(X_trn, y_trn)
print(dTreeR.score(X_trn, y_trn))
print(dTreeR.score(X_val, y_val))
# Feature importances from the unpruned and pruned trees:
print (pd.DataFrame(dTree.feature_importances_, columns = ["Imp DTree UnPruned"], index = X_trn.columns))
print (pd.DataFrame(dTreeR.feature_importances_, columns = ["Imp DTree Pruned"], index = X_trn.columns))
# Random Forest Regressor Learning Model (variable name "rfcl" kept for continuity):
rfcl = RandomForestRegressor(n_estimators = 50, random_state=6, max_features=8)
rfcl = rfcl.fit(X_trn, y_trn)
# y_predict = rfcl.predict(X_test)
# y_predict_rf = y_predict
# print(rfcl.score(X_test, y_test))
print(rfcl.score(X_trn, y_trn))
print(rfcl.score(X_val, y_val))
# Build AdaBoost Regressor Learning Model:
abcl = AdaBoostRegressor(n_estimators=90, random_state=6, learning_rate=0.6)
abcl = abcl.fit(X_trn, y_trn)
# y_predict = abcl.predict(X_test)
# y_predict_ab = y_predict
# print(abcl.score(X_test , y_test))
print(abcl.score(X_trn, y_trn))
print(abcl.score(X_val, y_val))
# Gradient Boost Regressor Learning Model:
gbcl = GradientBoostingRegressor(n_estimators = 40, random_state=6, learning_rate=0.3)
gbcl = gbcl.fit(X_trn, y_trn)
# y_predict = gbcl.predict(X_test)
# y_predict_gb = y_predict
# print(gbcl.score(X_test, y_test))
print(gbcl.score(X_trn, y_trn))
print(gbcl.score(X_val, y_val))
# Build Bagging Learning Model (bags of the unpruned tree fitted above):
# NOTE(review): base_estimator was renamed "estimator" in scikit-learn 1.2 — update on upgrade.
bgcl = BaggingRegressor(base_estimator=dTree, n_estimators=12, random_state=6)
bgcl = bgcl.fit(X_trn, y_trn)
# y_predict = bgcl.predict(X_test)
# y_predict_bg = y_predict
# print(bgcl.score(X_test , y_test))
print(bgcl.score(X_trn, y_trn))
print(bgcl.score(X_val, y_val))
# KFold cross-validation (50 folds) of a default GradientBoostingRegressor on the full X, y.
# FIX: shuffle=True added — KFold silently ignores random_state without shuffling,
# and scikit-learn >= 0.24 raises ValueError for random_state with shuffle=False.
kfold = KFold(n_splits=50, shuffle=True, random_state=6)
model = GradientBoostingRegressor()
results = cross_val_score(model, X, y, cv=kfold) # default scoring for a regressor is R^2
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# KFold cross-validation (50 folds) of a default RandomForestRegressor on the full X, y.
# FIX: shuffle=True added — KFold silently ignores random_state without shuffling,
# and scikit-learn >= 0.24 raises ValueError for random_state with shuffle=False.
kfold = KFold(n_splits=50, shuffle=True, random_state=6)
model = RandomForestRegressor()
results = cross_val_score(model, X, y, cv=kfold) # default scoring for a regressor is R^2
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
| Regression Algorithm | Other Metrics | R^2 Train DataSet | R^2 Validation Dataset | Comments |
|---|---|---|---|---|
| Simple Linear | RMSE: 8.898078241713177 | 0.7330438038819633 | 0.680319427895178 | Data: UnScaled, Cleaned |
| Simple Linear | RMSE: 0.5638396157514333 | 0.7330438038819633 | 0.682084887709276 | Data: Scaled, Cleaned |
| Polynomial LinRegr: Degree=2 | RMSE: 0.5258500933378105 | 0.7922581527934405 | 0.7234816793366159 | 29 New Features Added. Data: Scaled, Cleaned |
| Polynomial LinRegr: Degree=3 | RMSE: 0.563631653160918 | 0.8414541656579173 | 0.6823193595550907 | 85 New Features Added. Data: Scaled, Cleaned |
| Decision Tree, UnPruned | 0.9999571857586979 | 0.8072407565692836 | Original Data: UnScaled, Not PreProcessed: Contains 25 Duplicate Rows; 99 Outliers | |
| Decision Tree, Pruned: Depth=3 | 0.943778925044059 | 0.7649833982162988 | ^^^ Ditto ^^^ | |
| Random Forest | 0.9865892384207504 | 0.8997314457247279 | ^^^ Ditto ^^^ | |
| AdaBoost | 0.8267219994089168 | 0.8000375936309319 | ^^^ Ditto ^^^ | |
| Gradient Boost | 0.9639792909595184 | 0.90229938952186 | ^^^ Ditto ^^^ | |
| Bagging | 0.9837672362553681 | 0.8981940858966639 | ^^^ Ditto ^^^ | |
| KFold for GradientBoostingRegressor | Accuracy: 89.403% (5.606% Std.Dev.) | ^^^ Ditto ^^^ | ||
| KFold for RandomForestRegressor | Accuracy: 91.229% (5.695% Std.Dev.) | ^^^ Ditto ^^^ | ||
### My Housekeeping: Jupyter Code File Incremental Backup 8 Sat.Jul.11 12:25am ^^^
Markdown("### Incremental Jupyter Notebook Code Backup 8")
# ! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 8.ipynb"
! ls -l Project*.ipynb
Employ Techniques to squeeze that extra performance out of the model without making it over fit. Use ***Grid Search or Random Search*** on any of the two models used above. Make a DataFrame to compare models after hyperparameter tuning and their metrics as above.
# ====== Hyperparameter tuning via RandomizedSearchCV ======
# Instantiate a model for RandomForestRegressor:
model_rgr = RandomForestRegressor(n_estimators=50)
# specify parameters and distributions to sample from
# NOTE(review): criterion names "mse"/"mae" were renamed "squared_error"/"absolute_error"
# in scikit-learn 1.0 and removed in 1.2 — update these strings when upgrading.
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 7),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["mse", "mae"]}
# run randomized search
# n_iter = number of random samples
randomCV = RandomizedSearchCV(model_rgr, param_distributions=param_dist, n_iter=10) #default cv = 3
# Fit / Run RandomizedSearchCV for RandomForestRegressor :
# NOTE(review): searching on the FULL X, y includes the held-out test rows; consider
# running the search on the training split only so the test set stays unseen.
randomCV.fit(X, y)
print(randomCV.best_params_)
len(randomCV.cv_results_['mean_test_score']) # MSB: These many model fits (runs)
randomCV.cv_results_['mean_test_score'] # :MSB
# Instantiate a model for GradientBoostingRegressor:
model_gbr = GradientBoostingRegressor()
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 7),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              "criterion": ["mse", "mae", 'friedman_mse']}
# run randomized search
# n_iter = number of random samples
randomCV = RandomizedSearchCV(model_gbr, param_distributions=param_dist, n_iter=10) #default cv = 3
# Fit / Run RandomizedSearchCV for GradientBoostingRegressor :
# The two "! date" calls bracket the fit to show elapsed wall-clock time.
! date
randomCV.fit(X, y)
! date
print(randomCV.best_params_)
len(randomCV.cv_results_['mean_test_score']) # MSB: These many model fits (runs)
randomCV.cv_results_['mean_test_score'] # :MSB
# Based on RandomSearchCV above^ the following is BEST Model / Algorithm with these params:
# Build the FINAL Model & validate it on the UNSEEN / RESERVED TEST dataset:
# Gradient Boost Learning Model:
# NOTE(review): criterion='mse' was renamed 'squared_error' in scikit-learn 1.0
# and removed in 1.2 — update the string when upgrading scikit-learn.
gbcl = GradientBoostingRegressor(criterion='mse', learning_rate=0.6, max_depth=3, random_state=3,
                                 max_features=6, min_samples_leaf=10, min_samples_split=2)
gbcl = gbcl.fit(X_trn, y_trn)
# y_predict = gbcl.predict(X_test)
# y_predict_gb = y_predict
# print(gbcl.score(X_test, y_test))
print(gbcl.score(X_trn, y_trn))  # in-sample R^2
print(gbcl.score(X_val, y_val))  # validation R^2
# *** FINAL SCORE ON the UNSEEN TEST DATASET: ***
print(gbcl.score(X_tst, y_tst))
# Perform KFold on this FINAL BEST MODEL to check the Accuracy:
# FIX: shuffle=True added — KFold silently ignores random_state without shuffling,
# and scikit-learn >= 0.24 raises ValueError for random_state with shuffle=False.
kfold = KFold(n_splits=50, shuffle=True, random_state=7)
# NOTE(review): criterion='mse' was renamed 'squared_error' in scikit-learn 1.0 and removed in 1.2.
model = GradientBoostingRegressor(criterion='mse', learning_rate=0.6, max_depth=3, random_state=3,
                                  max_features=6, min_samples_leaf=10, min_samples_split=2)
results = cross_val_score(model, X_trn, y_trn, cv=kfold) # default scoring for a regressor is R^2
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
### My Housekeeping: Jupyter Code File Incremental Backup 7 Fri.Jul.10 07:59pm ^^^
Markdown("### Incremental Jupyter Notebook Code Backup 7")
# ! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 7.ipynb"
! ls -l Project*.ipynb
### My Housekeeping: Jupyter Code File Incremental Backup 9 Sat.Jul.11 01:15am ^^^
# Shell copy ("!" magic) of the notebook file as an incremental code backup:
Markdown("### Incremental Jupyter Notebook Code Backup 9")
! cp "Project 4 FEMST Concrete Strength Predict.ipynb" "Project 4 FEMST Concrete Strength Predict Backup 9.ipynb"
! ls -l Project*.ipynb